# -*-encoding:utf-8-*-
import requests
import re
from bs4 import BeautifulSoup
import csv
import threading
from time import gmtime, strftime


host='http://gupiao.51nb.com/'  # base URL of the forum; all relative hrefs are joined onto this
lock = threading.Lock()  # serializes appends to `datas` from concurrent scraper threads
datas=[]  # row dicts collected by all threads; written to CSV after join
threads=[]  # worker threads, one per sub-board; joined before the CSV is written

def log(data):
	"""Append *data* (a text string) to the scrape log '51nb.txt', UTF-8 encoded.

	The file is opened in binary append mode ('ab') because the write is
	already-encoded bytes: text mode would newline-translate those bytes on
	Windows and reject them entirely on Python 3. Characters that cannot be
	encoded are silently dropped ('ignore'), matching the original intent
	of a best-effort log.
	"""
	with open('51nb.txt', 'ab') as f:
		f.write(data.encode('utf8', 'ignore'))

def GetAllLinks():
	"""Fetch the trading-board index page and return a dict mapping each
	sub-board's link text to its absolute URL.

	Only list items at positions 1..13 inside the ``second1`` element are
	kept; item 0 and anything past 13 are skipped.
	"""
	links = {}
	index_url = host + 'forum.php?mod=forumdisplay&fid=41'
	print(index_url)
	response = requests.get(index_url, stream=True)
	page = BeautifulSoup(response.content, 'html.parser')
	board = page.find(id='second1')  # container of the sub-board list
	for pos, item in enumerate(board.find_all('li')):
		if 1 <= pos <= 13:
			anchor = item.find('a')
			links[anchor.contents[0]] = host + anchor['href']
	return links

class Topic(threading.local):
	"""Scraper for one trading sub-board: pulls a fixed number of listing
	pages and extracts one dict per thread row.

	NOTE(review): inheriting from ``threading.local`` makes ``__init__``
	re-run in every thread that touches the instance; each instance here
	is only ever used inside the single thread that created it, so this
	behaves like a plain class — confirm before sharing instances across
	threads.
	"""

	def __init__(self, name, url):
		self.name = name            # sub-board display name (unicode)
		self.basic_url = url        # board URL without the page parameter
		self.page_num = 3           # number of listing pages to scrape
		self.list = []              # accumulated row dicts

	def FillListFromPage(self):
		"""Scrape ``self.page_num`` listing pages and return the row dicts.

		Side effect: ``self.url`` is updated per page and read by
		GetDicFromTbody for the 'page_url' field.
		"""
		for i in range(self.page_num):
			self.url = self.basic_url + '&page=' + str(i + 1)
			print(self.name + u'开始抓取第' + str(i + 1) + u'页数据,页面链接：' + self.url)
			r = requests.get(self.url, stream=True)
			soup = BeautifulSoup(r.content, 'html.parser')
			# Each normal (non-sticky) thread row carries id="normalthread_<6 digits>".
			tbodys = soup.find_all(id=re.compile('normalthread_[0-9]{6}'))
			for tbody in tbodys:
				self.list.append(self.GetDicFromTbody(tbody))
			print(self.name + u'抓取第' + str(i + 1) + u'页数据完成')
		return self.list

	def GetDicFromTbody(self, tbody):
		"""Extract one listing row (a <tbody>) into a flat dict of strings.

		Makes one extra HTTP request per row to fetch the seller's profile
		card and read the seller type. Any field BeautifulSoup resolves to
		None is normalized to '' so the CSV writer never sees None.
		"""
		dic = {'plate': self.name, 'page_url': self.url}
		dic['title'] = tbody.find('th').select_one('.s.xst').get_text()
		dic['href'] = host + tbody.find('th').find_all('a')[1]['href']
		dic['seller'] = tbody.select_one('td.author').get_text().strip()
		dic['seller_url'] = host + tbody.select_one('td.author').find('a')['href']
		dic['price'] = tbody.select('td.num')[1].string
		dic['place'] = tbody.select('td.num')[2].string
		dic['time'] = tbody.select('td.num')[5].get_text().strip()
		dic['last_responser'] = tbody.select_one('td.by').find('a').string
		dic['last_response_time'] = tbody.select_one('td.by').find_all('a')[1].string

		# The profile-card endpoint answers with an XML wrapper whose <root>
		# text is itself HTML, hence the double parse.
		_card_url = dic['seller_url'] + '&ajaxmenu=1&inajax=1'
		r = requests.get(_card_url, stream=True)
		soup = BeautifulSoup(r.content, 'html.parser')
		soup = BeautifulSoup(soup.root.string, 'html.parser')
		dic['seller_type'] = soup.select_one('.y.xg1').string

		# Blank out missing fields (BeautifulSoup yields None for absent nodes).
		for k in dic:
			if dic[k] is None:
				dic[k] = ''
		return dic

def run_thread(name, url):
	"""Thread target: scrape one sub-board and merge its rows into the
	module-level ``datas`` list.

	*name* is the sub-board's display text, *url* its listing URL. All
	mutation of the shared ``datas`` happens under ``lock``.
	"""
	t_name = threading.current_thread().name
	print('thread %s >> begin' % (t_name))

	topic = Topic(name, url)
	rows = topic.FillListFromPage()

	global datas
	print('thread %s >> is locking ' % (t_name))
	# `with` acquires and guarantees release even if the append raises —
	# equivalent to the manual acquire/try/finally, minus the footguns.
	with lock:
		datas += rows
	print('thread %s >> lock release' % (t_name))


# Spawn one scraper thread per sub-board discovered on the index page.
# Thread names are GBK-encoded board names prefixed with 't'.
dic=GetAllLinks()
for x in dic:
	print(x)
	t = threading.Thread(target=run_thread, name='t' + x.encode('gbk'), args=(x, dic[x]))
	threads.append(t)
	t.start()

# Wait for every scraper to finish before the CSV is written below.
for t in threads:
	t.join()

# Dump all scraped rows to CSV in a single pass over one file handle:
# UTF-8 BOM first (so Excel auto-detects the encoding), then the header
# row, then one row per listing. Binary mode because the Python 2 csv
# module expects a byte stream and every cell is encoded explicitly.
file_name='51nb2.csv'
titles=[u'交易板块',u'交易主题',u'卖家',u'卖家类型',u'商品价格',u'发帖时间',u'商品链接',u'卖家资料',u'交易地点',u'最后回复时间',u'最后回复ID',u'信息来源页面链接']
with open(file_name, 'wb') as f:
	f.write(u'\ufeff'.encode('utf8'))
	writer = csv.writer(f)
	writer.writerow([title.encode('utf8') for title in titles])
	for dic in datas:
		row = [dic['plate'], dic['title'], dic['seller'], dic['seller_type'], dic['price'], dic['time'], dic['href'], dic['seller_url'], dic['place'], dic['last_responser'], dic['last_response_time'], dic['page_url']]
		writer.writerow([item.encode('utf8') for item in row])